# Exploratory analysis and sales-prediction models for a retail (Big Mart style) sales dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=False)  # embed plotly JS so figures render offline in the notebook
import plotly.express as px
# Load the dataset directly from a shared Google Sheet via its CSV export endpoint.
df = pd.read_csv("https://docs.google.com/spreadsheets/d/17_svn8lKuMPh4sl01a8Fca656yLRwbkYD2osTgmrvi8/export?format=csv")
df.head()
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FDA15 | 9.30 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.1380 |
| 1 | DRC01 | 5.92 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
| 2 | FDN15 | 17.50 | Low Fat | 0.016760 | Meat | 141.6180 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.2700 |
| 3 | FDX07 | 19.20 | Regular | 0.000000 | Fruits and Vegetables | 182.0950 | OUT010 | 1998 | NaN | Tier 3 | Grocery Store | 732.3800 |
| 4 | NCD19 | 8.93 | Low Fat | 0.000000 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
# Notebook shell cell: install the plotly "orca" binary plus headless-display
# dependencies so plotly can export static images in this (Colab-like) environment.
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4
## Thanks to Greg Hogg https://youtu.be/qNF1HqBvpGE for this solution to Plotly export issues
--2022-06-14 16:23:27-- https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage Resolving github.com (github.com)... 140.82.114.3 Connecting to github.com (github.com)|140.82.114.3|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/99037241/9dc3a580-286a-11e9-8a21-4312b7c8a512?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220614%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220614T162327Z&X-Amz-Expires=300&X-Amz-Signature=28cb11c78afd453a353c0dac8ea2f97e96f26e1d47dcc2703ebec06a1b822cc4&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=99037241&response-content-disposition=attachment%3B%20filename%3Dorca-1.2.1-x86_64.AppImage&response-content-type=application%2Foctet-stream [following] --2022-06-14 16:23:27-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/99037241/9dc3a580-286a-11e9-8a21-4312b7c8a512?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220614%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220614T162327Z&X-Amz-Expires=300&X-Amz-Signature=28cb11c78afd453a353c0dac8ea2f97e96f26e1d47dcc2703ebec06a1b822cc4&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=99037241&response-content-disposition=attachment%3B%20filename%3Dorca-1.2.1-x86_64.AppImage&response-content-type=application%2Foctet-stream Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ... Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.110.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 51607939 (49M) [application/octet-stream] Saving to: ‘/usr/local/bin/orca’ /usr/local/bin/orca 100%[===================>] 49.22M 128MB/s in 0.4s 2022-06-14 16:23:28 (128 MB/s) - ‘/usr/local/bin/orca’ saved [51607939/51607939] Reading package lists... 
Done Building dependency tree Reading state information... Done libgtk2.0-0 is already the newest version (2.24.32-1ubuntu1). libgconf-2-4 is already the newest version (3.2.6-4ubuntu1). xvfb is already the newest version (2:1.19.6-1ubuntu4.10). The following package was automatically installed and is no longer required: libnvidia-common-460 Use 'apt autoremove' to remove it. 0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
# Column dtypes and non-null counts (output below shows Item_Weight and Outlet_Size have nulls).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8523 entries, 0 to 8522 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Identifier 8523 non-null object 1 Item_Weight 7060 non-null float64 2 Item_Fat_Content 8523 non-null object 3 Item_Visibility 8523 non-null float64 4 Item_Type 8523 non-null object 5 Item_MRP 8523 non-null float64 6 Outlet_Identifier 8523 non-null object 7 Outlet_Establishment_Year 8523 non-null int64 8 Outlet_Size 6113 non-null object 9 Outlet_Location_Type 8523 non-null object 10 Outlet_Type 8523 non-null object 11 Item_Outlet_Sales 8523 non-null float64 dtypes: float64(4), int64(1), object(7) memory usage: 799.2+ KB
# Count fully duplicated rows.
df.duplicated().sum()
0
# Per-column missing-value counts.
df.isna().sum()
Item_Identifier 0 Item_Weight 1463 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 2410 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Impute missing Item_Weight values with the mean weight of the item's type.
avg_weights_by_type = df.groupby("Item_Type")["Item_Weight"].mean()  # kept for inspection
# groupby().transform("mean") broadcasts each type's mean back onto that type's
# rows, so a single vectorized fillna replaces the original per-type loop
# (which did a boolean scan and a .loc write for every distinct Item_Type).
df["Item_Weight"] = df["Item_Weight"].fillna(
    df.groupby("Item_Type")["Item_Weight"].transform("mean")
)
df.isna().sum()
Item_Identifier 0 Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 2410 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Inspect the Outlet_Size categories (includes NaN).
df["Outlet_Size"].unique()
array(['Medium', nan, 'High', 'Small'], dtype=object)
# Non-null counts per outlet: shows which specific outlets are missing Outlet_Size entirely.
df.groupby("Outlet_Identifier")[["Outlet_Size","Outlet_Location_Type"]].count()
| Outlet_Size | Outlet_Location_Type | |
|---|---|---|
| Outlet_Identifier | ||
| OUT010 | 0 | 555 |
| OUT013 | 932 | 932 |
| OUT017 | 0 | 926 |
| OUT018 | 928 | 928 |
| OUT019 | 528 | 528 |
| OUT027 | 935 | 935 |
| OUT035 | 930 | 930 |
| OUT045 | 0 | 929 |
| OUT046 | 930 | 930 |
| OUT049 | 930 | 930 |
# fillna(..., inplace=True) on a column selection is chained assignment:
# pandas 2.x emits a FutureWarning and under copy-on-write (pandas 3.0) it
# stops updating df at all. Assign the result back to the column instead.
df["Outlet_Size"] = df["Outlet_Size"].fillna("Unknown")
# Confirm no missing values remain.
df.isna().sum()
Item_Identifier 0 Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 0 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Rename the "High" size category to "Large" for clearer labels, then verify.
df.loc[df["Outlet_Size"]=="High",["Outlet_Size"]] ="Large"
df["Outlet_Size"].unique()
array(['Medium', 'Unknown', 'Large', 'Small'], dtype=object)
# Cardinality of each column.
df.nunique()
Item_Identifier 1559 Item_Weight 431 Item_Fat_Content 5 Item_Visibility 7880 Item_Type 16 Item_MRP 5938 Outlet_Identifier 10 Outlet_Establishment_Year 9 Outlet_Size 4 Outlet_Location_Type 3 Outlet_Type 4 Item_Outlet_Sales 3493 dtype: int64
# Inspect fat-content labels — the output shows the same two categories spelled five ways.
df["Item_Fat_Content"].unique()
array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)
# Normalize the fat-content spellings to two canonical labels.
# Series.replace(..., inplace=True) on a column selection is chained assignment
# (deprecated; silently ineffective under pandas copy-on-write), so collapse
# both calls into one dict-based replace and assign the result back.
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(
    {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"}
)
# Re-check cardinalities — Item_Fat_Content should now have 2 categories.
df.nunique()
Item_Identifier 1559 Item_Weight 431 Item_Fat_Content 2 Item_Visibility 7880 Item_Type 16 Item_MRP 5938 Outlet_Identifier 10 Outlet_Establishment_Year 9 Outlet_Size 4 Outlet_Location_Type 3 Outlet_Type 4 Item_Outlet_Sales 3493 dtype: int64
# List the item categories (16 of them per the nunique output above this cell).
df["Item_Type"].unique()
array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)
# Summary statistics for the numeric columns.
df.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| count | 8523.000000 | 8523.000000 | 8523.000000 | 8523.000000 | 8523.000000 |
| mean | 12.857890 | 0.066132 | 140.992782 | 1997.831867 | 2181.288914 |
| std | 4.232804 | 0.051598 | 62.275067 | 8.371760 | 1706.499616 |
| min | 4.555000 | 0.000000 | 31.290000 | 1985.000000 | 33.290000 |
| 25% | 9.310000 | 0.026989 | 93.826500 | 1987.000000 | 834.247400 |
| 50% | 12.867061 | 0.053931 | 143.012800 | 1999.000000 | 1794.331000 |
| 75% | 16.000000 | 0.094585 | 185.643700 | 2004.000000 | 3101.296400 |
| max | 21.350000 | 0.328391 | 266.888400 | 2009.000000 | 13086.964800 |
# Box plot of the per-item sales distribution (notched, filled box).
sales_column = df["Item_Outlet_Sales"]
fig, ax = plt.subplots()
ax.boxplot(
    [sales_column],
    labels=["Item Sales"],
    notch=True,
    patch_artist=True,
)
plt.show()
# Flag upper-fence IQR outliers in sales. Only the upper fence is checked;
# per the describe() output the lower fence (q1 - 1.5*IQR) would be negative,
# below the data minimum, so it can never fire here.
sales = df['Item_Outlet_Sales']
q1, q3 = sales.quantile(0.25), sales.quantile(0.75)
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr
outliers = df[sales > upper_fence]
outliers
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 43 | FDC02 | 21.350000 | Low Fat | 0.069103 | Canned | 259.9278 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 6768.5228 |
| 130 | FDY25 | 12.305705 | Low Fat | 0.033810 | Canned | 180.5976 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 7968.2944 |
| 132 | NCR53 | 13.142314 | Low Fat | 0.144338 | Health and Hygiene | 224.4404 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6976.2524 |
| 145 | FDP16 | 18.600000 | Low Fat | 0.039356 | Frozen Foods | 246.3802 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 7370.4060 |
| 203 | FDI24 | 12.277108 | Low Fat | 0.078362 | Baking Goods | 177.9370 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6704.6060 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8245 | FDU55 | 16.200000 | Low Fat | 0.035967 | Fruits and Vegetables | 260.3278 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 7549.5062 |
| 8329 | NCQ06 | 13.384736 | Low Fat | 0.041622 | Household | 253.6014 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6630.0364 |
| 8350 | NCE18 | 10.000000 | Low Fat | 0.021421 | Household | 248.3750 | OUT035 | 2004 | Small | Tier 2 | Supermarket Type1 | 7240.5750 |
| 8447 | FDS26 | 20.350000 | Low Fat | 0.089975 | Dairy | 261.6594 | OUT017 | 2007 | Unknown | Tier 2 | Supermarket Type1 | 7588.1226 |
| 8510 | FDN58 | 13.800000 | Regular | 0.056862 | Snack Foods | 231.5984 | OUT035 | 2004 | Small | Tier 2 | Supermarket Type1 | 7182.6504 |
186 rows × 12 columns
# Drop the outlier rows found above and re-inspect the summary statistics.
df.drop(outliers.index,axis=0 ,inplace=True)
df.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| count | 8337.000000 | 8337.000000 | 8337.000000 | 8337.000000 | 8337.000000 |
| mean | 12.856296 | 0.066358 | 139.161087 | 1997.968094 | 2059.362844 |
| std | 4.256788 | 0.051838 | 61.553351 | 8.309941 | 1506.099754 |
| min | 4.555000 | 0.000000 | 31.290000 | 1985.000000 | 33.290000 |
| 25% | 9.300000 | 0.027028 | 93.046200 | 1987.000000 | 810.944400 |
| 50% | 12.867061 | 0.053939 | 141.215400 | 1999.000000 | 1747.059200 |
| 75% | 16.100000 | 0.095299 | 183.695000 | 2004.000000 | 2998.097400 |
| max | 21.350000 | 0.328391 | 266.888400 | 2009.000000 | 6478.234000 |
# Correlation heatmap of the numeric columns. df.corr() with the default
# numeric_only raises TypeError on the object (string) columns in pandas >= 2.0,
# so restrict the correlation to numeric columns explicitly.
sns.heatmap(df.corr(numeric_only=True),cmap="Blues",annot= True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f75c9874a50>
# Scatter of sales vs. shelf visibility with a least-squares trend line.
x = df['Item_Visibility']
y = df['Item_Outlet_Sales']
slope, intercept = np.polyfit(x, y, 1)  # degree-1 fit: y ~ slope*x + intercept
fig, ax = plt.subplots()
ax.scatter(x, y, label="datapoint")
ax.plot(x, slope * x + intercept, "r-", label="Best Fit Line")
ax.legend()
ax.set_title("Total Sales Revenue by Visibility")
ax.set_xlabel("Visibility")  # fixed user-facing typo: was "Visiblity"
ax.set_ylabel("Sales Revenue")
plt.show()
# Sales distribution per outlet size: overlaid histograms, then side-by-side box plots.
histogram = df.groupby("Outlet_Size")["Item_Outlet_Sales"].hist(alpha=0.35, legend=True)
large = df.loc[df["Outlet_Size"] == "Large", "Item_Outlet_Sales"]
med = df.loc[df["Outlet_Size"] == "Medium", "Item_Outlet_Sales"]
small = df.loc[df["Outlet_Size"] == "Small", "Item_Outlet_Sales"]
uk = df.loc[df["Outlet_Size"] == "Unknown", "Item_Outlet_Sales"]
fig, ax = plt.subplots()
bps = ax.boxplot(
    [small, med, large, uk],
    labels=["Small", "Medium", "Large", "Unknown"],
    notch=True,
    patch_artist=True,
)
plt.show()
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
# Per-size summary stats (output below: Medium outlets have the highest mean sales).
df.groupby("Outlet_Size")["Item_Outlet_Sales"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Outlet_Size | ||||||||
| Large | 917.0 | 2217.100205 | 1398.677744 | 73.2380 | 1057.95620 | 2014.7108 | 3103.95960 | 6474.2392 |
| Medium | 2676.0 | 2458.690424 | 1530.464302 | 69.2432 | 1238.22155 | 2135.8864 | 3448.84400 | 6478.2340 |
| Small | 2362.0 | 1852.606258 | 1483.026265 | 33.9558 | 593.22780 | 1517.3582 | 2775.55375 | 6474.2392 |
| Unknown | 2382.0 | 1755.044328 | 1432.707736 | 33.2900 | 549.28500 | 1417.4882 | 2631.24160 | 6471.5760 |
# Interactive stacked histogram: total sales per outlet, colored by outlet size.
fig = px.histogram(df,x="Outlet_Identifier", y="Item_Outlet_Sales",
title="Total Sales Per Outlet",
hover_data=['Outlet_Size'],
color ="Outlet_Size",
labels = {"Outlet_Size": "Outlet Size",
"Outlet_Identifier": "Outlet ID",
"Item_Outlet_Sales": "Total Sales"},
category_orders = {"Outlet_Size":["Small",
"Medium",
"Large",
"Unknown"]})
# Render the figure inline via plotly's offline mode.
py.iplot(fig)
# Interactive stacked histogram: total sales per item type, colored by outlet type.
fig = px.histogram(df,x="Item_Type", y="Item_Outlet_Sales",
title="Total Sales Per Item Type",
hover_data=['Outlet_Type'],
color ="Outlet_Type",
labels = {"Outlet_Type": "Outlet Type",
"Item_Type": "Item Type",
"Item_Outlet_Sales": "Total Sales"},
category_orders = {"Outlet_Type":["Supermarket Type1",
"Supermarket Type2",
"Supermarket Type3",
"Grocery Store"]},)
# Render the figure inline via plotly's offline mode.
py.iplot(fig)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
# Prediction target: per-item outlet sales.
y = df["Item_Outlet_Sales"]
# Numeric features used as-is.
numeric_features = ["Item_Weight",
"Item_Visibility",
"Item_MRP",
"Outlet_Establishment_Year"]
# Categorical features to be ordinal-encoded. NOTE(review): most of these are
# nominal rather than ordered, so ordinal codes impose an arbitrary order —
# tolerable for tree models, but one-hot encoding may be worth comparing.
ordinal_encoded_features = ["Outlet_Size",
"Outlet_Type",
"Outlet_Identifier",
"Item_Type",
"Item_Fat_Content"]
# Assemble the full feature matrix (numeric + categorical features).
X = df[numeric_features + ordinal_encoded_features]
X.head()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Outlet_Size | Outlet_Type | Outlet_Identifier | Item_Type | Item_Fat_Content | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 9.30 | 0.016047 | 249.8092 | 1999 | Medium | Supermarket Type1 | OUT049 | Dairy | Low Fat |
| 1 | 5.92 | 0.019278 | 48.2692 | 2009 | Medium | Supermarket Type2 | OUT018 | Soft Drinks | Regular |
| 2 | 17.50 | 0.016760 | 141.6180 | 1999 | Medium | Supermarket Type1 | OUT049 | Meat | Low Fat |
| 3 | 19.20 | 0.000000 | 182.0950 | 1998 | Unknown | Grocery Store | OUT010 | Fruits and Vegetables | Regular |
| 4 | 8.93 | 0.000000 | 53.8614 | 1987 | Large | Supermarket Type1 | OUT013 | Household | Low Fat |
# Split first, then encode: the encoder is fitted on the training rows only,
# so no information from the validation set leaks into the encoding.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)
# Work on explicit copies before writing encoded columns back in. The split
# frames derive from a column-sliced view of df, and assigning into them can
# raise SettingWithCopyWarning (and silently lose the writes under
# pandas copy-on-write).
X_train = X_train.copy()
X_valid = X_valid.copy()
ord_enc = OrdinalEncoder()
# OrdinalEncoder is unsupervised and ignores y, so y_train is not passed.
X_train[ordinal_encoded_features] = ord_enc.fit_transform(X_train[ordinal_encoded_features])
X_valid[ordinal_encoded_features] = ord_enc.transform(X_valid[ordinal_encoded_features])
X_train.head()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Outlet_Size | Outlet_Type | Outlet_Identifier | Item_Type | Item_Fat_Content | |
|---|---|---|---|---|---|---|---|---|---|
| 6613 | 8.630000 | 0.032961 | 115.0518 | 1999 | 1.0 | 1.0 | 9.0 | 13.0 | 1.0 |
| 3895 | 20.100000 | 0.000000 | 61.3536 | 1998 | 3.0 | 0.0 | 0.0 | 8.0 | 0.0 |
| 5894 | 12.817344 | 0.000000 | 98.2042 | 1985 | 2.0 | 0.0 | 4.0 | 10.0 | 1.0 |
| 5726 | 21.100000 | 0.000000 | 233.2958 | 2009 | 1.0 | 2.0 | 3.0 | 9.0 | 0.0 |
| 1771 | 13.500000 | 0.021496 | 180.5976 | 1997 | 2.0 | 1.0 | 8.0 | 6.0 | 0.0 |
# Baseline model: random forest with default hyperparameters.
model = RandomForestRegressor(random_state=1) # random state is so that testing can be measured reliably
model.fit(X_train,y_train)
RandomForestRegressor(random_state=1)
def get_MAE(model, X, y):
    """Mean absolute error of *model*'s predictions on (X, y)."""
    predictions = model.predict(X)
    return mean_absolute_error(predictions, y)

def get_percent(model, X, y):
    """Rough 'accuracy' figure: 1 minus MAE relative to the mean target value."""
    relative_error = get_MAE(model, X, y) / y.mean()
    return 1 - relative_error

def print_stats(model, X, y):
    """Print the MAE and relative-accuracy figure for *model* on (X, y)."""
    mae = get_MAE(model, X, y)
    print("The MAE for this model is {:.2f}".format(mae))
    print("The accuracy for this model is {:%}".format(get_percent(model, X, y)))

# Evaluate the baseline forest on the held-out validation split.
print_stats(model, X_valid, y_valid)
The MAE for this model is 744.32 The accuracy for this model is 63.953409%
# Sweep the forest size from 50 to 300 trees and record validation MAE per size.
results = {}
for trees in range(50, 301, 50):
    test_model = RandomForestRegressor(n_estimators=trees, random_state=1)
    test_model.fit(X_train, y_train)
    results[trees] = get_MAE(test_model, X_valid, y_valid)
    print(f"{trees}:")
    print_stats(test_model, X_valid, y_valid)
50: The MAE for this model is 743.92 The accuracy for this model is 63.972851% 100: The MAE for this model is 744.32 The accuracy for this model is 63.953409% 150: The MAE for this model is 743.67 The accuracy for this model is 63.984932% 200: The MAE for this model is 742.69 The accuracy for this model is 64.032610% 250: The MAE for this model is 741.80 The accuracy for this model is 64.075361% 300: The MAE for this model is 741.77 The accuracy for this model is 64.077056%
# Validation MAE vs. number of trees; the y-axis is inverted so that
# "better" (lower MAE) appears higher on the chart.
plt.plot(list(results.keys()), list(results.values()))
plt.gca().invert_yaxis()
plt.show()
from xgboost.sklearn import XGBRegressor
# Second model: gradient-boosted trees with a small learning rate and a large
# tree budget, relying on early stopping to pick the effective size.
# NOTE(review): early_stopping_rounds as a constructor argument requires
# xgboost >= 1.6; on older versions it must be passed to fit() — confirm the
# installed version if this is rerun elsewhere.
model2 = XGBRegressor(n_estimators = 2000,
learning_rate=0.005,
early_stopping_rounds = 10,
random_state= 1)
# The validation split doubles as the early-stopping monitor set.
model2.fit(X_train,y_train,
eval_set= [(X_valid, y_valid)],
verbose = False)
print_stats(model2,X_valid,y_valid)
[16:24:24] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror. The MAE for this model is 706.50 The accuracy for this model is 65.784921%
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
# Third model: a small fully-connected network. BatchNormalization on the raw
# inputs standardizes the features; dropout regularizes the two hidden layers.
model3 = keras.Sequential([
layers.BatchNormalization(input_shape = [X_train.shape[1]]),
layers.Dense(256,activation="relu"),
layers.Dropout(0.3),
layers.BatchNormalization(),
layers.Dense(256,activation="relu"),
layers.Dropout(0.3),
layers.Dense(1)
])
# Stop when validation loss fails to improve by 0.001 for 20 epochs and roll
# back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
min_delta = 0.001,
patience = 20,
restore_best_weights = True
)
# MAE loss keeps the network's objective directly comparable to the MAE
# reported for the other models.
model3.compile(
optimizer="adam",
loss="mae",
metrics=['mae'],
)
history = model3.fit(
X_train, y_train,
validation_data=(X_valid, y_valid),
batch_size=128,
callbacks=[early_stopping],
epochs=100,
verbose=0,
)
# Plot the validation-loss curve and report the best epoch's loss. Since the
# loss is MAE, the same 1 - MAE/mean "accuracy" figure is derived from it.
history_df = pd.DataFrame(history.history)
history_df['val_loss'].plot()
best_results = history_df['val_loss'].min()
print("Minimum Validation Loss: {:0.4f}".format(best_results))
print("Highest Accuracy: {:0.4%}".format(1-(best_results/df['Item_Outlet_Sales'].mean())))
Minimum Validation Loss: 700.5079 Highest Accuracy: 65.9842%
from sklearn.model_selection import GridSearchCV
# Hyperparameter sweep for the random forest: tree count, feature sampling,
# and a minimum leaf-weight fraction as a pruning knob.
param_grid = {
    "n_estimators": list(range(100, 251, 50)),
    "max_features": [1.0, "sqrt", "log2"],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.01],
}
# Fix random_state on the base estimator so the search is reproducible and its
# scores are comparable with the earlier runs (which all used random_state=1);
# the original left it unset, making every rerun give different results.
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=1),
                           param_grid=param_grid,
                           scoring="neg_mean_absolute_error",
                           n_jobs=4,
                           verbose=2)
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
GridSearchCV(estimator=RandomForestRegressor(), n_jobs=4,
param_grid={'max_features': [1.0, 'sqrt', 'log2'],
'min_weight_fraction_leaf': [0.0, 0.1, 0.01],
'n_estimators': [100, 150, 200, 250]},
scoring='neg_mean_absolute_error', verbose=2)
# Evaluate the refit best estimator found by the grid search on the validation split.
print_stats(grid_search,X_valid,y_valid)
The MAE for this model is 702.43 The accuracy for this model is 65.982081%
# Best hyperparameter combination found by the search.
grid_search.best_params_
{'max_features': 1.0, 'min_weight_fraction_leaf': 0.01, 'n_estimators': 100}
from joblib import dump
# Persist the fitted GridSearchCV object (it contains best_estimator_) to disk.
dump(grid_search,"model.joblib")
['model.joblib']